########Graphing in R########################
#We'll be using mostly basic plots, but we will go over how to take simple graphs and make them 
#publication worthy. My hope is that you will be able to take the commands and script here and apply it to any 
#graphs that you will need for your own work.
#A basic roadmap for today: 
#We will start with simple single-variable graphs, and we'll see right away just how many modifications go into even a simple graph!
#We will be working with a pre-loaded dataset from the "psych" package that will allow us to 
#illustrate most of the types of graphs that we are going to work with. So let's begin by installing the psych package if you haven't already.
# Install each package only when it is missing. An unconditional
# install.packages() would re-download the package every time the script runs.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
library(psych)

# Now we load our dataset in...
data(sat.act)
# attach() puts the columns on the search path; the plotting calls further down
# refer to ACT, SATV, SATQ and gender by their bare names.
attach(sat.act)

# Let's take a look at our dataset...
View(sat.act)

# This dataset consists of data that were collected online from 700
# participants in a personality study.
# We have the participants' gender, age, level of education, ACT scores, and
# SAT verbal and quant scores.
# We will also be using a number of packages to create our graphs that we need
# to install/load, so we can do that right now.

if (!requireNamespace("lattice", quietly = TRUE)) {
  install.packages("lattice")
}
library(lattice)
# lattice is probably the most widely used (or at least should be...) package
# for graphics in R

if (!requireNamespace("plotrix", quietly = TRUE)) {
  install.packages("plotrix")
}
library(plotrix)
# plotrix is another graphics package, but a bit more specialized than lattice

# car is loaded here as well, so make sure it is installed too.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}
library(car)

# the rms package is going to help us with some of the regression we will use
# later
if (!requireNamespace("rms", quietly = TRUE)) {
  install.packages("rms")
}
library(rms)

# Simulate an X variable: 10,000 normal draws with a mean of zero and an SD of
# ten.
x <- rnorm(n = 10000, mean = 0, sd = 10)

# The simplest graph of them all: a histogram.
hist(x)

# Arguments let us change both the labels and the range of the axes.
hist(
  x,
  xlim = c(-50, 50),
  ylim = c(0, 2000),
  xlab = "X Values",
  ylab = "y Values"
)

# There are countless arguments to add: a main title, a fill colour, and
# freq = FALSE to plot proportions instead of counts.
hist(
  x,
  freq = FALSE,
  col = "blue",
  xlim = c(-50, 50),
  ylim = c(0, 0.045),
  main = "Small preview of Graphing in R!",
  xlab = "Random normal distribution with\nmean of 0 and SD of 10",
  ylab = "Y Proportion"
)

# colors() lists the names of the colours available in R.
colors()

# lines() adds... you guessed it. This is how we add a density line to the
# histogram that is already on screen.
lines(density(x), col = "red")

# Create a y with mean 5 and SD of 10...
y <- rnorm(n = 10000, mean = 5, sd = 10)
# ...and add its density estimate to the same graph.
lines(density(y), col = "darkgreen")

# We're going to add some text to our graph with the help of locator().
# Each call waits for one click: on the chart, click where you would like the
# text to appear.
location1 <- locator(n = 1)
location2 <- locator(n = 1)

# Now place the text at the clicked locations.
# Note that "\n" inside a string starts a new line.
text(location1, labels = "mean = 0.0")
text(location2, labels = "mean is equal\n to 5.0")

# abline() will put a line wherever we want; here a vertical line at x = 0.
abline(v = 0)
# The curve function draws a curve according to an expression in x that we
# specify.
curve(x^3 + 3 * (x^2) + 7 * x + 4)

# from / to change the range of x for a better view.
curve(
  x^3 + 3 * (x^2) + 7 * x + 4,
  from = -10,
  to = 10
)

# expression() lets us use mathematical symbols (Greek letters, superscripts)
# in addition to regular text in the axis labels.
curve(
  x^3 + 3 * (x^2) + 7 * x + 4,
  from = -10,
  to = 10,
  xlab = expression(Sigma(alpha^3 * zeta) + Pi(c^2)),
  ylab = expression(beta)
)
# Before we get started graphing we need to make a quick tweak to our data...
# Gender is stored as a numeric code (1 / 2), so relabel it and turn it into a
# factor.
# Only sat.act itself changes here: the copy of `gender` that attach() placed
# on the search path earlier keeps its numeric codes.
sat.act$gender <- local({
  gender_codes <- as.character(sat.act$gender)
  gender_codes[gender_codes == "1"] <- "Male"
  gender_codes[gender_codes == "2"] <- "Female"
  factor(gender_codes)
})

# Sometimes when we have discrete groups, we want to "stack" our histograms to
# show frequencies by group. histStack() does exactly that. Makes sense, right?
# z = gender separates the histogram into two groups by gender. Because we now
# need several colours, we pass a vector of colours to col.
# If you get a warning that "z was converted to a factor", that's just R's way
# of saying that gender was numeric, but was converted.
# NOTE(review): `l = TRUE` is an abbreviated argument name; confirm which
# histStack() argument it resolves to and spell it out.
histStack(
  ACT,
  z = gender,
  col = c("darkred", "gold"),
  las = 1,
  l = TRUE,
  main = "Histogram of ACT scores",
  xlab = "ACT Scores",
  ylab = "Frequencies"
)

# Rather than relying on knowing the data to tell which colour is which group,
# let's add a legend.
histStack(
  ACT,
  z = sat.act$gender,
  col = c("darkred", "gold"),
  las = 1,
  l = TRUE,
  main = "Histogram of ACT scores",
  xlab = "ACT Scores",
  ylab = "Frequencies",
  legend.pos = "topleft"
)

# Our histogram is starting to look better.
# We can "suggest" the number of breaks for our histogram, which will change
# the number of columns.
histStack(
  ACT,
  z = sat.act$gender,
  breaks = 5,
  col = c("darkred", "gold"),
  las = 1,
  l = TRUE,
  main = "Histogram of ACT scores",
  xlab = "ACT Scores",
  ylab = "Frequencies",
  legend.pos = "topleft"
)

# The frustrating thing is that we can only "suggest" columns; R may not take
# our advice! In this case, our graph looks the same.
# Here we instead specify where the breaks fall (every 2 points from 0 to 36)
# rather than how many there are. Now our graph looks quite a bit different!
histStack(
  ACT,
  z = sat.act$gender,
  breaks = seq(0, 36, 2),
  col = c("darkred", "gold"),
  las = 1,
  l = TRUE,
  main = "Histogram of ACT scores",
  xlab = "ACT Scores",
  ylab = "Frequencies",
  legend.pos = "topleft"
)

# I think our old break sequence was more informative, so we'll go back.
# We can also change the limits of the axes; here the y-axis runs from 0 to 250.
histStack(
  ACT,
  z = sat.act$gender,
  ylim = c(0, 250),
  col = c("darkred", "gold"),
  las = 1,
  l = TRUE,
  main = "Histogram of ACT scores",
  xlab = "ACT Scores",
  ylab = "Frequencies",
  legend.pos = "topleft"
)

# The "ylim" and "xlim" arguments specify the range of our y- and x-axes.
# Let's make some changes that will work with proportions instead of counts:
# freq = FALSE gives us a histogram of proportions rather than actual
# frequencies. `labels` is the full name of the argument that `l` abbreviates
# in hist().
hist(
  ACT,
  freq = FALSE,
  labels = TRUE,
  col = "snow",
  las = 1,
  ylim = c(0.00, 0.10),
  main = "Histogram of ACT scores",
  xlab = "ACT Scores",
  ylab = "Proportions"
)

# Since we're working with densities, let's put a kernel density estimate over
# our histogram! bw = 2 sets the bandwidth of the density estimator.
eq <- density(ACT, bw = 2)
# Draw the estimate as a thin dashed red line on top of the histogram.
lines(eq, col = "red", lty = 2, lwd = 1)


################################Scatter plots!############################################
# So, we did a lot just working with histograms. We're going to move on to
# working with scatterplots.
# We'll go through the same process of starting with our boring graph and
# making something great!
# Still working with our sat.act data, we're going to make a simple scatter
# plot of SAT Verbal against SAT Quantitative.
plot(SATV ~ SATQ)

# First thing we want to do is label our graph, using the same arguments as
# before.
plot(
  SATV ~ SATQ,
  main = "Scatterplot of SAT Verbal and Quantitative Scores",
  xlab = "SAT Quantitative Scores",
  ylab = "SAT Verbal Scores"
)

# Now that our axes are clearly labeled, let's change the plotting character
# from boring old circles. "pch" picks the symbol that represents each point;
# the number determines which character is used.
plot(
  SATV ~ SATQ,
  pch = 5,
  main = "Scatterplot of SAT Verbal and Quantitative Scores",
  xlab = "SAT Quantitative Scores",
  ylab = "SAT Verbal Scores"
)

# "cex" changes the size of the points. Let's make our points smaller since we
# have a bunch grouped close together.
plot(
  SATV ~ SATQ,
  pch = 5,
  cex = 0.5,
  main = "Scatterplot of SAT Verbal and Quantitative Scores",
  xlab = "SAT Quantitative Scores",
  ylab = "SAT Verbal Scores"
)

# Our points are now much smaller. Next, las = 2 rotates the numbers on our
# axes so they are drawn perpendicular to the axis.
plot(
  SATV ~ SATQ,
  pch = 5,
  cex = 0.5,
  las = 2,
  main = "Scatterplot of SAT Verbal and Quantitative Scores",
  xlab = "SAT Quantitative Scores",
  ylab = "SAT Verbal Scores"
)

# Like our histogram, we can also add colors to our points.
plot(
  SATV ~ SATQ,
  pch = 5,
  cex = 0.5,
  las = 2,
  col = "burlywood",
  main = "Scatterplot of SAT Verbal and Quantitative Scores",
  xlab = "SAT Quantitative Scores",
  ylab = "SAT Verbal Scores"
)

# If we want filled points, those are options 21-25 for pch; "bg" sets the
# fill colour.
plot(
  SATV ~ SATQ,
  pch = 21,
  cex = 0.5,
  las = 1,
  col = "purple",
  bg = "purple",
  main = "Scatterplot of SAT Verbal and Quantitative Scores",
  xlab = "SAT Quantitative Scores",
  ylab = "SAT Verbal Scores"
)

# Now let's put in a best fit line from a linear regression of SATV on SATQ,
# and print the regression summary.
abline(lm(SATV ~ SATQ))
summary(lm(SATV ~ SATQ))

# The abline function allows us to fit reference lines: straight horizontal or
# vertical lines through a particular value, or best fit lines.
# Let's add some reference lines at the means. describe() shows the summary
# statistics (including the mean) for each variable.
describe(SATV)
describe(SATQ)

# Draw the lines at the computed means instead of typing in rounded values
# copied from the describe() output. na.rm = TRUE skips missing scores.
abline(h = mean(SATV, na.rm = TRUE))
abline(v = mean(SATQ, na.rm = TRUE))

# So we've added reference lines at the mean for both Y and X, along with our
# best fit line.
# Let's clean out our graph and get rid of the reference lines by redrawing it,
# this time distinguishing between the males and females in our sample.
# `gender` here is the attached numeric column (1 = male, 2 = female), so it
# can index the colour vector directly: black for code 1, red for code 2.
# Both col (border) and bg (fill) must be indexed by gender; an un-indexed bg
# vector is simply recycled in row order and would not follow gender.
plot(
  SATV ~ SATQ,
  pch = 21,
  col = c("black", "red")[gender],
  bg = c("black", "red")[gender],
  las = 1,
  cex = 0.5,
  xlab = "SAT Quantitative Scores",
  ylab = "SAT Verbal Scores",
  main = "Scatterplot of SAT Verbal and Quantitative Scores"
)
# Let's now create a confidence interval for predictions.
# Fit the regression with rms::ols(), keeping the design matrix and response
# (x = TRUE, y = TRUE) in the fit object.
prediction <- ols(SATV ~ SATQ, x = TRUE, y = TRUE)

# Predicted SATV over a fine grid of SATQ values; conf.type = "mean" requests
# the confidence band for the predicted mean.
p <- Predict(
  prediction,
  SATQ = seq(200, 800, by = 0.1),
  conf.type = "mean"
)
plot(p, col = "lightblue", ylim = c(200, 800))

# We can see that our band is pretty close to the line: the confidence interval
# around the fitted mean is narrow.
# Let's go back to our original scatterplot divided by gender.
# `gender` is the attached numeric column (1 = male, 2 = female). Both the
# border (col) and the fill (bg) are indexed by it so that the two always
# agree; an un-indexed bg vector would just be recycled in row order.
plot(
  SATV ~ SATQ,
  pch = 21,
  col = c("black", "red")[gender],
  bg = c("black", "red")[gender],
  las = 1,
  cex = 0.5,
  xlab = "SAT Quantitative Scores",
  ylab = "SAT Verbal Scores",
  main = "Scatterplot of SAT Verbal and Quantitative Scores"
)
# Add the best fit line and annotate it; font = 4 is bold italic.
abline(lm(SATV ~ SATQ))
text(310, 475, labels = "R-Squared=.42 --->", cex = 1, font = 4)
# The last type of 2-D scatterplot we will go over is the "smooth scatterplot".
# It's like a regular scatterplot... but smooth.
# Note that SATV is passed first here, so it is on the x-axis.
smoothScatter(x = SATV, y = SATQ, las = 1)
# Again, we can see roughly where the majority of the points are; this can be
# helpful with a large number of data points.

# We are going to wrap up with some customizations of the fonts.
# The Hershey font families are where we get to customize our fonts; see the
# help page for the list.
help("Hershey")
smoothScatter(
  x = SATV,
  y = SATQ,
  las = 1,
  main = "Smoother Scatterplot",
  family = "HersheyGothicEnglish"
)
smoothScatter(
  x = SATV,
  y = SATQ,
  las = 1,
  main = "Smoother Scatterplot",
  family = "HersheySymbol"
)

# So these are definitely different fonts... Let's make a more normal
# customization: font = 4 makes the text bold and italic.
smoothScatter(
  x = SATV,
  y = SATQ,
  las = 1,
  main = "Smoother Scatterplot",
  family = "HersheySans",
  font = 4
)
# When it comes to graphing, fonts are one of the weaknesses in R; the support
# for fonts is not great.

#######################Multivariate Scatterplots################################
# The first way to look at the relationships between more than two variables is
# a scatterplot matrix.
# NOTE(review): the string form of `diagonal`, and the `reg.line`, `smoother`
# and `levels` arguments used below, look like the interface of older car
# releases; confirm they still have the described effect with the installed
# car version.

# This type of plot has a lot of information in it, so let's break it down.
# On the diagonal we have the kernel smoothers that we saw with our histograms.
scatterplotMatrix(~ SATV + SATQ + ACT + gender)

# The diagonal can be left empty...
scatterplotMatrix(
  ~ SATV + SATQ + ACT + gender,
  diagonal = "none"
)

# ...and we have some other options for the diagonal as well. Let's put
# histograms there instead.
scatterplotMatrix(
  ~ SATV + SATQ + ACT + gender,
  diagonal = "histogram"
)

# Gender included as a variable in this matrix is making things a bit awkward;
# instead we can plot by gender using "| gender".
scatterplotMatrix(
  ~ SATV + SATQ + ACT | gender,
  diagonal = "histogram"
)
# This plot looks a little intense; it might be helpful to take a subset of our
# sample.

# We can simplify this graph by getting rid of the smoother and the regression
# lines.
scatterplotMatrix(
  ~ SATV + SATQ + ACT,
  diagonal = "histogram",
  reg.line = FALSE,
  smoother = NULL
)

# We can also add ellipses to the off-diagonal panels that represent
# concentrations of points on our scatterplot.
scatterplotMatrix(
  ~ SATV + SATQ + ACT,
  diagonal = "histogram",
  ellipse = TRUE,
  levels = c(0.5, 0.99),
  reg.line = FALSE,
  smoother = NULL
)

# GGplot2 time! GGplot2 makes really elegant plots really easily.
# However the code looks a little different than we're used to.
# Let's start with the scatterplot of SAT scores that we had before.
library(ggplot2)
# The ggpubr package will help us add some information to our gg plots.
# Install it only when it is missing, so re-running the script does not
# download it again.
if (!requireNamespace("ggpubr", quietly = TRUE)) {
  install.packages("ggpubr")
}
library(ggpubr)
# We'll start with making a simple scatterplot.
ggplot(data = sat.act, mapping = aes(x = SATQ, y = SATV)) +
  geom_point()
# There are a few things to note here.
# 1. We have more than one function here; the ggplot function starts by
#    creating a ggplot object.
# 2. The aes function creates the aesthetic mapping; this basically defines
#    which variables go where.
# 3. The second component (geom_point) actually draws the graph from the object
#    we created.
# 4. The components are bridged using a plus sign, allowing us to continually
#    add components.

# Adding a theme to our plot: there are some premade themes, like black and
# white, "classic", minimal, and dark (which we have selected here).
ggplot(data = sat.act, mapping = aes(x = SATQ, y = SATV)) +
  geom_point() +
  theme_dark()
# Part of defining the aesthetic is specifying groups; here we color the points
# based on which gender group they are in.
ggplot(data = sat.act, mapping = aes(x = SATQ, y = SATV)) +
  geom_point(mapping = aes(color = gender)) +
  theme_bw()

# geom_smooth creates a smoothing line; here we selected LOcal regrESSion.
ggplot(data = sat.act, mapping = aes(x = SATQ, y = SATV)) +
  geom_point(mapping = aes(color = gender), show.legend = FALSE) +
  theme_bw() +
  geom_smooth(method = "loess")

# Here we implement a more standard regression line (and take away the SEs).
ggplot(data = sat.act, mapping = aes(x = SATQ, y = SATV)) +
  geom_point(mapping = aes(color = gender), show.legend = FALSE) +
  theme_bw() +
  geom_smooth(method = "lm", se = FALSE)

# Here we use the color mapping for both the points and the regression lines.
# This allows us to see a regression line for both males and females.
scatter1 <- ggplot(data = sat.act, mapping = aes(x = SATQ, y = SATV)) +
  geom_point(mapping = aes(color = gender), show.legend = FALSE) +
  theme_bw() +
  geom_smooth(mapping = aes(color = gender), method = "lm", se = FALSE)
scatter1

# We keep the same plot, but now add some statistics to it with stat_cor().
scatter1 + stat_cor()
# facet_wrap is also good for groups, but instead of plotting the two groups
# together we create two separate panels, one per gender.
scatter2 <- ggplot(data = sat.act, mapping = aes(x = SATQ, y = SATV)) +
  geom_point() +
  theme_classic() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ gender)
scatter2

# Here we are now adding the statistic to each panel.
scatter2 + stat_cor()
# The labs function allows for easy creation of labels, including titles,
# captions and axis labels.
ggplot(data = sat.act, mapping = aes(x = SATQ, y = SATV)) +
  geom_point() +
  theme_minimal() +
  geom_smooth(method = "lm") +
  labs(
    title = "Scatterplot",
    caption = "All the labels!!",
    x = "SAT Quant",
    y = "SAT Verbal"
  )

# We can make easy modifications to our axes: here we reverse our x-axis
# entirely and rescale our y-axis to be on a log scale.
ggplot(data = sat.act, mapping = aes(x = SATQ, y = SATV)) +
  geom_point() +
  theme_bw() +
  geom_smooth(method = "lm") +
  scale_x_reverse() +
  scale_y_log10()
# Age takes whole-number values, so many points sit on top of each other.
ggplot(sat.act, aes(x = age, y = SATV)) +
  geom_point()

# geom_jitter adds a little bit of "noise" to the plotted positions, which is
# handy when we have points that overlap. It replaces geom_point here: using
# both layers would draw every observation twice, once unjittered.
ggplot(sat.act, aes(x = age, y = SATV)) +
  geom_jitter()
# We can even use ggplot2 to check regression assumptions: stat_qq draws a Q-Q
# plot of SATV, with a separate color for each gender.
ggplot(data = sat.act) +
  stat_qq(mapping = aes(sample = SATV, color = gender))
# To make different graphs, all we need to do is switch functions and
# variables: here a boxplot of SATV by gender.
box <- ggplot(data = sat.act, mapping = aes(x = gender, y = SATV)) +
  geom_boxplot()
box

# And we can see if our differences are significant (t-test, shown as
# significance symbols).
box + stat_compare_means(method = "t.test", label = "p.signif")

# We can easily incorporate more than two groups.
education_box <- ggplot(
  data = sat.act,
  mapping = aes(x = as.factor(education), y = SATV)
) +
  geom_boxplot()
education_box

# Add our ANOVA stats to the graph...
education_box + stat_compare_means(method = "anova")

# ...or compare individual means, using a reference group.
# NOTE(review): ref.group is given as the number 6 here; confirm that this
# selects the intended education level rather than needing the level's label.
education_box + stat_compare_means(label = "p.signif", ref.group = 6)

# Time series plots are also easy to create in ggplot2.
ggplot(economics, aes(date, pce)) +
  geom_line() +
  theme_bw()

# Just like before we can add a smoothing line.
ggplot(economics, aes(date, pce)) +
  geom_point() +
  theme_bw() +
  geom_smooth()

# We can also take two time series and put them side by side.
g1 <- ggplot(economics, aes(date, psavert)) +
  geom_point() +
  theme_bw() +
  geom_smooth()
g2 <- ggplot(economics, aes(date, unemploy)) +
  geom_point() +
  theme_bw() +
  geom_smooth()

# cowplot arranges several plots in a grid. Install it only when it is missing,
# so re-running the script does not download it again.
if (!requireNamespace("cowplot", quietly = TRUE)) {
  install.packages("cowplot")
}
library(cowplot)
p <- plot_grid(
  g1, g2,
  labels = c("Savings", "Unemployment"),
  ncol = 2,
  nrow = 1
)
p

# Finally, we'll save our final plot...
save_plot("finalplot.png", p)
# The plot should then be saved in your working directory. As easy as that!
# Most of ggplot2 follows the same basic roadmap: we begin by creating a ggplot
# object, we specify the type of graph we want, then add functions to make
# whatever modifications we need.
# So, rather than specifying one whole graph, we specify a piece here and there
# and can end up making awesome graphs.
#Be on the look out next quarter for our R programming...
#Intermediate Topics in R
#Multilevel Modeling in R
#Machine Learning in R